import librosa
from librosa import display
from PIL import Image
from matplotlib import pyplot
from numpy import asarray
import matplotlib.pyplot as plt
import numpy as np
import cmath
import seaborn as sns
import scipy
import IPython.display as ipd
import math
from numpy.linalg import inv
import torch
import torchvision
from torchvision import datasets
import numpy as np
import time
import numpy
import matplotlib.pyplot as plt
import torch.nn as nn
# Run on GPU when available; tensors created later in the script use `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Clean (target) speech, loaded at its native sample rate (sr=None).
train_clean_male, sr1 = librosa.load("train_clean_male.wav",sr=None)
# STFT with a 1024-point FFT and 512-sample hop -> 513 frequency bins per frame.
S=librosa.stft(train_clean_male,n_fft=1024,hop_length=512)
ipd.display(ipd.Audio(train_clean_male,rate=16000))
# Noisy (input) speech; same STFT settings so frames align with the clean target.
sn,sr2=librosa.load("train_dirty_male.wav",sr=None)
X=librosa.stft(sn,n_fft=1024,hop_length=512)
ipd.display(ipd.Audio(sn,rate=16000))
# Magnitude spectrograms: mod_S is the training target, mod_X the network input.
mod_S=np.abs(S)
mod_X=np.abs(X)
print(np.shape(mod_S))
print(np.shape(mod_X))
As part of the network we have used the following architecture:
1) Convolution layer with kernel size = 2, stride = 1 and 16 filters, followed by ReLU activation.
2) Max pooling with kernel size = 2 and stride = 1.
3) Convolution layer with kernel size = 2, stride = 1 and 32 filters, followed by ReLU activation.
4) Max pooling with kernel size = 2 and stride = 2.
5) Flattening into a fully connected layer with 8160 neurons (the layer before the output layer).
6) The final layer has 513 neurons with ReLU activation, as we need positive values as part of the output.
class onedcnn(nn.Module):
    """1-D CNN denoiser mapping one magnitude-spectrum frame to another.

    Input is shaped (batch, 1, 513); output is (batch, 513).
    Pipeline: conv(k=2,s=1,16f) -> ReLU -> maxpool(k=2,s=1)   # 513 -> 512 -> 511
              conv(k=2,s=1,32f) -> ReLU -> maxpool(k=2,s=2)   # 511 -> 510 -> 255
              flatten (32 * 255 = 8160) -> linear -> ReLU.
    The final ReLU keeps the predicted magnitudes non-negative.
    """

    def __init__(self):
        super(onedcnn, self).__init__()
        # First conv stage: 1 input channel -> 16 feature maps.
        self.layer1 = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=2, stride=1),
            nn.ReLU(),
            nn.MaxPool1d(2, stride=1),
        )
        # Second conv stage: 16 -> 32 feature maps, pooling halves the length.
        self.layer2 = nn.Sequential(
            nn.Conv1d(16, 32, kernel_size=2, stride=1),
            nn.ReLU(),
            nn.MaxPool1d(2, stride=2),
        )
        # 255*16*2 == 32 channels * 255 positions == 8160 flattened features.
        self.layer3 = nn.Linear(255 * 16 * 2, 513)
        self.act = nn.ReLU()

    def forward(self, x):
        features = self.layer2(self.layer1(x))
        # Flatten all channel/position dims into one feature vector per sample.
        flat = features.reshape(features.size(0), -1)
        return self.act(self.layer3(flat))
# Train the denoiser on mini-batches of 128 STFT frames.
# Fix 1: use .to(device) instead of .cuda() so the script also runs on CPU
#        (the `device` selected above already falls back to CPU).
# Fix 2: feed the network with unsqueeze(1), not resize_. resize_ on the
#        transposed view reinterpreted the raw (513, T) storage as (T, 1, 513),
#        scrambling every input frame relative to the correctly-transposed
#        labels (and it errors on views in current PyTorch).
# Fix 3: average the epoch loss over the actual batch count (20 for 2459
#        frames), not a hard-coded 19.
model1 = onedcnn().to(device)

# Loss and optimizer
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model1.parameters(), lr=0.0001)

n_frames = mod_X.shape[1]          # 2459 in the original data
batch_size = 128
n_batches = math.ceil(n_frames / batch_size)  # last batch may be short

errt = [0 for i in range(200)]
for epoch in range(200):
    running_loss = 0
    for j in range(n_batches):
        # Slice one batch of frames; min() truncates the final batch.
        lo = j * batch_size
        hi = min((j + 1) * batch_size, n_frames)
        images = torch.tensor(mod_X[:, lo:hi], device=device)
        labels = torch.tensor(mod_S[:, lo:hi], device=device)
        # (513, T) -> (T, 1, 513): one spectrum frame per sample, 1 channel.
        inputs = torch.transpose(images, 0, 1).unsqueeze(1)
        # Forward pass
        outputs = model1(inputs)
        loss = criterion(outputs, torch.transpose(labels, 0, 1))
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    errt[epoch] = running_loss / n_batches
    print("Epoch:", epoch, "Loss:", errt[epoch])

plt.figure()
plt.plot(errt)
plt.title('Convergence')
# Denoise test clip 1: predict magnitude frames with the net, reattach the
# noisy phase (X / |X|), and invert the STFT.
# Fixes vs. original: unsqueeze(1) replaces resize_(142, 1, 513), which both
# hard-coded the frame count and reinterpreted the transposed view's raw
# storage, scrambling every frame fed to the network.
test_x_01, sr2 = librosa.load("test_x_01.wav", sr=None)
ipd.display(ipd.Audio(test_x_01, rate=16000))
testx01 = librosa.stft(test_x_01, n_fft=1024, hop_length=512)
testx01_abs = np.abs(testx01)
print(np.shape(testx01_abs))
test_x_01_tensor = torch.tensor(testx01_abs)
# (513, T) -> (T, 1, 513); works for any clip length.
test_x_01_transpose = torch.transpose(test_x_01_tensor, 0, 1).unsqueeze(1).to(device)
with torch.no_grad():
    new_outputs_test_x_01 = model1(test_x_01_transpose)
print(np.shape(new_outputs_test_x_01))
new_outputs1 = torch.transpose(new_outputs_test_x_01, 0, 1)
# Predicted magnitude * noisy phase: X * |S_hat| / |X|.
testx01_complex = torch.tensor(testx01, device=device)
new_output2 = torch.div(torch.mul(testx01_complex, new_outputs1),
                        torch.abs(testx01_complex))
recovered_test_01_x = (new_output2.data).cpu().numpy()
signal_test_01_x = librosa.core.istft(recovered_test_01_x, hop_length=512)
ipd.display(ipd.Audio(signal_test_01_x, rate=16000))
# Denoise test clip 2 — same pipeline as clip 1.
# Fixes vs. original: unsqueeze(1) replaces resize_(380, 1, 513), which both
# hard-coded the frame count and scrambled the transposed view's storage.
test_x_02, sr2 = librosa.load("test_x_02.wav", sr=None)
ipd.display(ipd.Audio(test_x_02, rate=16000))
testx02 = librosa.stft(test_x_02, n_fft=1024, hop_length=512)
testx02_abs = np.abs(testx02)
print(np.shape(testx02_abs))
test_x_02_tensor = torch.tensor(testx02_abs)
# (513, T) -> (T, 1, 513); works for any clip length.
test_x_02_transpose = torch.transpose(test_x_02_tensor, 0, 1).unsqueeze(1).to(device)
with torch.no_grad():
    new_outputs_test_x_02 = model1(test_x_02_transpose)
print(np.shape(new_outputs_test_x_02))
new_output11 = torch.transpose(new_outputs_test_x_02, 0, 1)
# Predicted magnitude * noisy phase: X * |S_hat| / |X|.
testx02_complex = torch.tensor(testx02, device=device)
new_output12 = torch.div(torch.mul(testx02_complex, new_output11),
                         torch.abs(testx02_complex))
recovered_test_02_x = (new_output12.data).cpu().numpy()
signal_test_02_x = librosa.core.istft(recovered_test_02_x, hop_length=512)
ipd.display(ipd.Audio(signal_test_02_x, rate=16000))
def snr(ground_clean, recovered_one):
    """Signal-to-noise ratio in dB between a clean reference signal and a
    reconstruction, computed over their common (shorter) length.

    SNR = 10 * log10( sum(clean^2) / sum((clean - recovered)^2) ).
    """
    # Truncate both signals to the same length before comparing.
    length = min(len(ground_clean), len(recovered_one))
    clean = ground_clean[:length]
    recovered = recovered_one[:length]
    signal_energy = np.sum(np.square(clean))
    noise_energy = np.sum(np.square(clean - recovered))
    return 10 * math.log(signal_energy / noise_energy, 10)
# Evaluate on the (dirty) training input itself: denoise all frames of mod_X,
# reconstruct with the noisy phase of X, and score SNR against the clean
# reference waveform.
# Fixes vs. original: unsqueeze(1) replaces resize_(2459, 1, 513), which both
# hard-coded the frame count and scrambled the transposed view's storage.
mod_S_tensor = torch.tensor(mod_X)  # NOTE: despite the name, this is the noisy input
# (513, T) -> (T, 1, 513); works for any number of frames.
mod_S_t = torch.transpose(mod_S_tensor, 0, 1).unsqueeze(1).to(device)
with torch.no_grad():
    new_outputs = model1(mod_S_t)
new_outputs13 = torch.transpose(new_outputs, 0, 1)
# Predicted magnitude * noisy phase: X * |S_hat| / |X|.
X_complex = torch.tensor(X, device=device)
new_output12 = torch.div(torch.mul(X_complex, new_outputs13),
                         torch.abs(X_complex))
recovered_test_03_x = (new_output12.data).cpu().numpy()
signal_test_03_x = librosa.core.istft(recovered_test_03_x, hop_length=512)
ipd.display(ipd.Audio(signal_test_03_x, rate=16000))
snr(train_clean_male, signal_test_03_x)